{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 本周实践目标\n",
    "<mark> 如何有系统地爬取更多页(相同结构)的数据 </mark> [猎聘PC版](https://www.liepin.com/zhaopin/)\n",
    "* 翻页：参数字典的拆解\n",
    "  * xpath解析翻页a/@href\n",
    "  * 建构参数模板\n",
    "  * 建构参数字典\n",
    "* 翻页：系统性迭代\n",
    "  * robots.txt\n",
    "  * 频率及时间\n",
    "* 翻页：数据备份与整合\n",
    "  * 储存备份\n",
    "  * 数据整合"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 翻页：参数字典拆解"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 建立连接及获取URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<Element 'a' href='/zhaopin/?init=-1&headckid=2a8c40e4fb5e770b&fromSearchBtn=2&ckid=2a8c40e4fb5e770b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=2b90937accebab966207ffccd7906c6c&d_curPage=0&d_pageSize=40&d_headId=2b90937accebab966207ffccd7906c6c&curPage=1'>,\n",
       " <Element 'a' href='/zhaopin/?init=-1&headckid=2a8c40e4fb5e770b&fromSearchBtn=2&ckid=2a8c40e4fb5e770b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=2b90937accebab966207ffccd7906c6c&d_curPage=0&d_pageSize=40&d_headId=2b90937accebab966207ffccd7906c6c&curPage=2'>,\n",
       " <Element 'a' href='/zhaopin/?init=-1&headckid=2a8c40e4fb5e770b&fromSearchBtn=2&ckid=2a8c40e4fb5e770b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=2b90937accebab966207ffccd7906c6c&d_curPage=0&d_pageSize=40&d_headId=2b90937accebab966207ffccd7906c6c&curPage=3'>,\n",
       " <Element 'a' href='/zhaopin/?init=-1&headckid=2a8c40e4fb5e770b&fromSearchBtn=2&ckid=2a8c40e4fb5e770b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=2b90937accebab966207ffccd7906c6c&d_curPage=0&d_pageSize=40&d_headId=2b90937accebab966207ffccd7906c6c&curPage=4'>,\n",
       " <Element 'a' href='/zhaopin/?init=-1&headckid=2a8c40e4fb5e770b&fromSearchBtn=2&ckid=2a8c40e4fb5e770b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=2b90937accebab966207ffccd7906c6c&d_curPage=0&d_pageSize=40&d_headId=2b90937accebab966207ffccd7906c6c&curPage=1'>,\n",
       " <Element 'a' class=('last',) href='/zhaopin/?init=-1&headckid=2a8c40e4fb5e770b&fromSearchBtn=2&ckid=2a8c40e4fb5e770b°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=2b90937accebab966207ffccd7906c6c&d_curPage=0&d_pageSize=40&d_headId=2b90937accebab966207ffccd7906c6c&curPage=9' title='末页'>]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from requests_html import HTMLSession\n",
    "\n",
    "# Pagination anchors sit in the pager bar; keep only links into /zhaopin.\n",
    "PAGER_LINK_XPATH = '//div[@class=\"pagerbar\"]/a[starts-with(@href,\"/zhaopin\")]'\n",
    "# Seconds to wait on the server, so a stalled connection cannot hang the kernel.\n",
    "REQUEST_TIMEOUT_S = 10\n",
    "\n",
    "session = HTMLSession()\n",
    "url = 'https://www.liepin.com/zhaopin/'\n",
    "r = session.get(url, timeout=REQUEST_TIMEOUT_S)\n",
    "# Surface HTTP error statuses here rather than as a silently empty link list below.\n",
    "r.raise_for_status()\n",
    "翻页url = r.html.xpath(PAGER_LINK_XPATH)\n",
    "翻页url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['2', '3', '4', '5', '下一页', '']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect the visible label of every pagination link (these are labels, not URLs).\n",
    "页数名 = [link.text for link in 翻页url]\n",
    "页数名"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=1',\n",
       " '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=2',\n",
       " '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=3',\n",
       " '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=4',\n",
       " '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=1',\n",
       " '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=9']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read href straight from the parsed attributes instead of re-querying each\n",
    "# element with XPath. The HTML parser decodes the '&deg' prefix of\n",
    "# '&degradeFlag' as the degree-sign entity, so the original text is restored.\n",
    "页数url = [link.attrs['href'].replace('°radeFlag', '&degradeFlag')\n",
    "         for link in 翻页url]\n",
    "页数url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'2': '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=1',\n",
       " '3': '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=2',\n",
       " '4': '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=3',\n",
       " '5': '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=4',\n",
       " '下一页': '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=1',\n",
       " '': '/zhaopin/?init=-1&headckid=3535f118c68812b6&fromSearchBtn=2&ckid=3535f118c68812b6°radeFlag=0&siTag=1B2M2Y8AsgTpgAmY7PhCfg%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_unknown&d_ckId=228b6c5eec1f026d1452c12596a79253&d_curPage=0&d_pageSize=40&d_headId=228b6c5eec1f026d1452c12596a79253&curPage=9'}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Map each link label to its href. The HTML parser turns the '&deg' prefix of\n",
    "# '&degradeFlag' into the degree sign, which fuses degradeFlag into the ckid\n",
    "# value once the query string is parsed, so the parameter name is restored here.\n",
    "# The last-page anchor has no visible text, so it ends up under the '' key.\n",
    "页面url = {\n",
    "    link.text: link.attrs['href'].replace('°radeFlag', '&degradeFlag')\n",
    "    for link in 翻页url\n",
    "}\n",
    "页面url"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 拆解URL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>init</th>\n",
       "      <th>headckid</th>\n",
       "      <th>fromSearchBtn</th>\n",
       "      <th>ckid</th>\n",
       "      <th>siTag</th>\n",
       "      <th>d_sfrom</th>\n",
       "      <th>d_ckId</th>\n",
       "      <th>d_curPage</th>\n",
       "      <th>d_pageSize</th>\n",
       "      <th>d_headId</th>\n",
       "      <th>curPage</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1</td>\n",
       "      <td>3535f118c68812b6</td>\n",
       "      <td>2</td>\n",
       "      <td>3535f118c68812b6°radeFlag=0</td>\n",
       "      <td>1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-1</td>\n",
       "      <td>3535f118c68812b6</td>\n",
       "      <td>2</td>\n",
       "      <td>3535f118c68812b6°radeFlag=0</td>\n",
       "      <td>1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-1</td>\n",
       "      <td>3535f118c68812b6</td>\n",
       "      <td>2</td>\n",
       "      <td>3535f118c68812b6°radeFlag=0</td>\n",
       "      <td>1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-1</td>\n",
       "      <td>3535f118c68812b6</td>\n",
       "      <td>2</td>\n",
       "      <td>3535f118c68812b6°radeFlag=0</td>\n",
       "      <td>1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-1</td>\n",
       "      <td>3535f118c68812b6</td>\n",
       "      <td>2</td>\n",
       "      <td>3535f118c68812b6°radeFlag=0</td>\n",
       "      <td>1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>-1</td>\n",
       "      <td>3535f118c68812b6</td>\n",
       "      <td>2</td>\n",
       "      <td>3535f118c68812b6°radeFlag=0</td>\n",
       "      <td>1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw</td>\n",
       "      <td>search_unknown</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>0</td>\n",
       "      <td>40</td>\n",
       "      <td>228b6c5eec1f026d1452c12596a79253</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  init          headckid fromSearchBtn                         ckid  \\\n",
       "0   -1  3535f118c68812b6             2  3535f118c68812b6°radeFlag=0   \n",
       "1   -1  3535f118c68812b6             2  3535f118c68812b6°radeFlag=0   \n",
       "2   -1  3535f118c68812b6             2  3535f118c68812b6°radeFlag=0   \n",
       "3   -1  3535f118c68812b6             2  3535f118c68812b6°radeFlag=0   \n",
       "4   -1  3535f118c68812b6             2  3535f118c68812b6°radeFlag=0   \n",
       "5   -1  3535f118c68812b6             2  3535f118c68812b6°radeFlag=0   \n",
       "\n",
       "                                           siTag         d_sfrom  \\\n",
       "0  1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw  search_unknown   \n",
       "1  1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw  search_unknown   \n",
       "2  1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw  search_unknown   \n",
       "3  1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw  search_unknown   \n",
       "4  1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw  search_unknown   \n",
       "5  1B2M2Y8AsgTpgAmY7PhCfg~fA9rXquZc5IkJpXC-Ycixw  search_unknown   \n",
       "\n",
       "                             d_ckId d_curPage d_pageSize  \\\n",
       "0  228b6c5eec1f026d1452c12596a79253         0         40   \n",
       "1  228b6c5eec1f026d1452c12596a79253         0         40   \n",
       "2  228b6c5eec1f026d1452c12596a79253         0         40   \n",
       "3  228b6c5eec1f026d1452c12596a79253         0         40   \n",
       "4  228b6c5eec1f026d1452c12596a79253         0         40   \n",
       "5  228b6c5eec1f026d1452c12596a79253         0         40   \n",
       "\n",
       "                           d_headId curPage  \n",
       "0  228b6c5eec1f026d1452c12596a79253       1  \n",
       "1  228b6c5eec1f026d1452c12596a79253       2  \n",
       "2  228b6c5eec1f026d1452c12596a79253       3  \n",
       "3  228b6c5eec1f026d1452c12596a79253       4  \n",
       "4  228b6c5eec1f026d1452c12596a79253       1  \n",
       "5  228b6c5eec1f026d1452c12596a79253       9  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "init             1\n",
      "headckid         1\n",
      "fromSearchBtn    1\n",
      "ckid             1\n",
      "siTag            1\n",
      "d_sfrom          1\n",
      "d_ckId           1\n",
      "d_curPage        1\n",
      "d_pageSize       1\n",
      "d_headId         1\n",
      "curPage          5\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "from urllib.parse import urlparse, parse_qs\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "def _first_values(query):\n",
    "    \"\"\"Parse a query string into a flat dict, keeping the first value of each key.\"\"\"\n",
    "    return {name: values[0] for name, values in parse_qs(query).items()}\n",
    "\n",
    "\n",
    "# One row per pagination link, one column per URL component.\n",
    "df = pd.DataFrame([urlparse(href) for href in 页面url.values()])\n",
    "# One row per pagination link, one column per query-string parameter.\n",
    "df_qs = pd.DataFrame(list(map(_first_values, df['query'])))\n",
    "display(df_qs)\n",
    "# Count distinct values per parameter to see which ones vary between pages.\n",
    "print(df_qs.nunique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 使curPage变成整数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1\n",
       "1    2\n",
       "2    3\n",
       "3    4\n",
       "4    1\n",
       "5    9\n",
       "Name: curPage, dtype: int32"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Query-string values are parsed as text; cast the page index to integers.\n",
    "curPage_int = df_qs['curPage'].astype(int)\n",
    "curPage_int"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
